# ----------------------------------------------------------
# Cleaning the LBNL Solar Installations data 
# ----------------------------------------------------------
library(pacman)
p_load(
  data.table, here, lubridate, tidyverse, hrbrthemes,
  tigris, sf, DescTools
)

theme_set(hrbrthemes::theme_ipsum())
options(tigris_use_cache = TRUE)

# Reading in the raw data
install_dt_raw = rbind(
  fread(here("Data/LBNL-pv-installations/TTS_LBNL_public_file_14-Dec-2021_p1.csv")),
  fread(here("Data/LBNL-pv-installations/TTS_LBNL_public_file_14-Dec-2021_p2.csv"))
)

# Reading google data
gps_zip_dt = fread(
    here("Data/google-project-sunroof/project-sunroof-postal_code.csv")
  ) %>% 
  .[,zipcode := str_pad(region_name, width = 5, side = "left",pad = "0")]

# Tigris
zip_sf = zctas(cb = TRUE) |>
  rename(zipcode = ZCTA5CE10) |>
  st_simplify(dTolerance = 100) |>
  st_transform(crs = 2163)

state_sf = states(cb = TRUE)|>
  st_simplify(dTolerance = 100) |>
  st_transform(crs = 2163)


# Replacing all -1 with NA
install_dt_raw[install_dt_raw == -1] = NA

# What is this data?
skimr::skim(install_dt_raw)

# Filtering to residential and 2000-2020 
install_dt = 
  install_dt_raw[
    customerSegment == "RES" & year(dmy(PTODate_orProxy_)) %in% 2000:2020,
    .(
      install_date = dmy(PTODate_orProxy_),
      zipcode = hostCustomerZip__4_,
      city = hostCustomerCity,
      state,
      system_size_dc = systemSizeInDCSTC_KW_,
      total_cost = totalInstalledCost___,
      subsidy = Up_FrontCashIncentive___,
      cost_per_kw = totalInstalledCost___/systemSizeInDCSTC_KW_,
      total_panels = coalesce(moduleQty_1,0) + coalesce(moduleQty_2,0) + coalesce(moduleQty_3,0) 
    )
  ]

# Winsorizing because there are a few outliers  
win_pr = 0.01
pr = c(win_pr/2,1-win_pr/2)
install_dt[,':='(
  total_cost_w = Winsorize(total_cost, na.rm = TRUE, probs = pr),
  total_panels_w = Winsorize(total_panels, na.rm = TRUE, probs = pr)
)]


# Avg cost and subsidy by state
install_dt[,.(
    avg_cost = mean(total_cost, na.rm =T), 
    avg_subsidy = mean(subsidy, na.rm = T),
    avg_size = mean(system_size_dc, na.rm = T),
    .N
  ),
  by=state
]

# Plotting total installs per year 
install_dt[, .(
      count = .N,
      count_price = sum(!is.na(total_cost)),
      count_subsidy = sum(!is.na(subsidy))
    ), 
    by=year(install_date)
  ] |>
  pivot_longer(cols = 2:4)|>
  ggplot(aes(x=year,y = value, color = name)) +
  geom_line() 

# Plotting size trends 
install_dt[,.(
    size_25 = quantile(system_size_dc, probs = 0.25, na.rm = T),
    size_50 = quantile(system_size_dc, probs = 0.5, na.rm = T),
    size_75 = quantile(system_size_dc, probs = 0.75, na.rm = T)
  ),
  by = year(install_date)
] |>
  ggplot(aes(x = year)) +
  geom_line(aes(y = size_50/1e3), size = 2) + 
  geom_ribbon(aes(ymin = size_25/1e3, ymax = size_75/1e3), fill = "blue", alpha = 0.4) +
  labs(
    title = "System Sizes by Year",
    x = "Year",
    y = "kW",
    caption = "Line shows the median while the range shows the 25th to 75th percentile."
  )

# Plotting the average price and subsidy by year 
rbind(
  install_dt[,.(
      type = "Price",
      price_25 = quantile(cost_per_kw, probs = 0.25, na.rm = T),
      price_50 = quantile(cost_per_kw, probs = 0.5, na.rm = T),
      price_75 = quantile(cost_per_kw, probs = 0.75, na.rm = T)
    ),
    by = year(install_date)
  ],
  install_dt[,.(
    type = "Subsidy",
    price_25 = quantile(subsidy/system_size_dc, probs = 0.25, na.rm = T),
    price_50 = quantile(subsidy/system_size_dc, probs = 0.5, na.rm = T),
    price_75 = quantile(subsidy/system_size_dc, probs = 0.75, na.rm = T)
  ),
  by = year(install_date)
  ] 
) |>
  ggplot(aes(x = year, color = type, fill = type)) +
  geom_line(aes(y = price_50/1e3), size = 2) + 
  geom_ribbon(aes(ymin = price_25/1e3, ymax = price_75/1e3), alpha = 0.4, color = NA) +
  scale_color_manual(name = "", values = c("#048BA8","#F18F01")) +
  scale_fill_manual(name = "", values = c("#048BA8","#F18F01")) +
  labs(
    title = "Install Prices and Subsidies by Year",
    x = "Year",
    y = "Nominal $/W",
    caption = "Line shows the median while the range shows the 25th to 75th percentile."  )

# What has happened to subsidies
install_dt[,.(
  tot_subsidy = sum(subsidy,na.rm=T)/sum(system_size_dc,na.rm=T)),
  keyby=.(state,year(install_date))
] |> 
  ggplot(aes(x = year, y = tot_subsidy/1e3, color = state)) + 
  geom_line() +
  labs(
    title = "Subsidies have been declining",
    x = "Year",
    y = "Subsidy per W",
    color = "State"
  )

# Getting summary data by zip code 
install_zip_dt = 
  install_dt[
    year(install_date) <= 2015,
    .(
      avg_cost = mean(total_cost, na.rm =T), 
      avg_subsidy = mean(subsidy, na.rm = T),
      avg_size = mean(system_size_dc, na.rm = T),
      avg_cost_kw = mean(cost_per_kw, na.rm = T),
      count = .N,
      count_price = sum(!is.na(total_cost)),
      count_subsidy = sum(!is.na(subsidy))
    ),
    by=str_sub(zipcode,1,5)
  ] |> 
  setnames("str_sub","zipcode")

install_state_dt = 
  install_dt[
    #year(install_date) %in% 2010:2015
    ,.(
      avg_cost = mean(total_cost, na.rm =T), 
      avg_subsidy = mean(subsidy, na.rm = T),
      avg_size = mean(system_size_dc, na.rm = T),
      avg_cost_kw = mean(cost_per_kw, na.rm = T),
      count = .N,
      count_price = sum(!is.na(total_cost)),
      count_subsidy = sum(!is.na(subsidy))
    ),
    by=state
  ]

# Merging with google data 
solar_zip_dt = 
  merge(
    gps_zip_dt,
    install_zip_dt,
    by = "zipcode"
  ) 

# Checking correlation between google and install data 
ggplot(
  data = solar_zip_dt[!is.na(zipcode)], 
  aes(x = existing_installs_count, y = count)
) + 
  geom_point(alpha = 0.3) +
  geom_abline(linetype = "dashed") + 
  geom_smooth(method = "lm") + 
  labs(
    title = "Tracking the Sun vs Google Project Sunroof Install Counts",
    x = "Tracking the Sun Installs",
    y = "Google Project Sunroof Installs",
  )



# Merging with shapes
solar_zip_sf = 
  full_join(
    zip_sf,
    solar_zip_dt,
    by = "zipcode"
  )
install_state_sf = 
  full_join(
    state_sf |> filter(!(STATEFP %in% c("60","66","69","72","78","02","15"))),
    install_state_dt,
    by = c("STUSPS"="state")
  )

# Maps
ggplot() +
  geom_sf(data = install_state_sf, aes(fill = avg_cost_kw/1e6), color = NA) +
  scale_fill_viridis_c("Cost Per kW")
ggplot() +
  geom_sf(data = install_state_sf, aes(fill = 1- count_price/count), color = NA) +
  scale_fill_viridis_c("Missing Price")
ggplot() +
  geom_sf(data = install_state_sf, aes(fill = 1- count_subsidy/count), color = NA) +
  scale_fill_viridis_c("Missing Subsidy")
ggplot() +
  geom_sf(data = install_state_sf, aes(fill = count/1e3), color = NA) +
  scale_fill_viridis_c("Total Installations ('000s)")




install_dt[,.(
  tot = .N,
  tot_price = sum(!is.na(total_cost)),
  tot_panels = sum(!is.na(total_panels) & total_panels > 0)
)],
by=state]





